#!/bin/bash

# check and notify users of long running processes on the mentat systems
# this script should run on one system
# if run manually, pass any command-line argument to enable test mode
# (the MANUAL_NOTIFY Env. mentioned in earlier notes is not read by this script)

#    (c) 2011 Donders Centre/Simon Oosthoek
#
#    This file is part of PIM.
#
#    PIM is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 2 of the License, or
#    (at your option) any later version.
#
#    PIM is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with PIM.  If not, see <http://www.gnu.org/licenses/>.
#
#TODO:

# Test mode is switched on by passing at least one command-line argument.
case $# in
	0) TESTMODE=false ;;
	*) TESTMODE=true ;;
esac

# Load the PIM shell library from the first location where it exists;
# without it the script cannot continue.
if [ -f /opt/cluster/pim_lib ]
then
	. /opt/cluster/pim_lib
elif [ -f /mnt/opt/cluster/pim_lib ]
then
	. /mnt/opt/cluster/pim_lib
else
	echo "cannot load pim_lib" >&2
	exit 1
fi

# Load the configuration file. $pim_conffile is not assigned in this script,
# so it is presumably provided by pim_lib (TODO confirm).
# The expansion is quoted on purpose: unquoted, an empty value turns the test
# into the one-argument form "[ -f ]", which is always true, and the script
# would then call "." without a file name.
if [ -f "$pim_conffile" ]
then
	. "$pim_conffile"
else
	echo "$PROG: cannot load $pim_conffile" >&2
	exit 1
fi

# template for the mail that is sent to the owner of a long running process
mailtempl_longrun=$pim_configdir/mail_longrunning.tpl

DEBUGNOTIFY=false

# The admin report is collected in a temporary file. mktemp creates the file
# itself under a name that cannot be predicted, so another user cannot
# pre-create the path (or plant a symlink there) before this script writes
# to it.
mailreport=$(mktemp "/tmp/$PROG-report-XXXXXX") || {
	echo "$PROG: cannot create temporary report file" >&2
	exit 1
}

#initialise mailreport for sendmail -t
# (the Subject: line is left empty here and completed when the report is sent)
{
	echo "Subject: "
	echo "To: $ADMINMAIL"
	echo
} >"$mailreport"

# Escape a string so that sed inserts it literally when it is used as the
# replacement of an s/…/…/ command with "/" as delimiter: "\", "/" and "&"
# each get a backslash in front.
function _sed_escape_repl {
	printf '%s' "$1" | sed -e 's|[\\/&]|\\&|g'
}

# Write a mail template to stdout with its __placeholder__ tokens filled in.
#
# Arguments:
#   $1  template file (falls back to $mailtempl_longrun when empty)
#   $2  text for __subject__
#   $3  text for __to_email__
# Globals read:
#   pid_ownername, pid_elapsedtime, procname, pid_machine, pid_pid,
#   url_info, url_kill, url_renew, cmd_kill, cmd_renew, time_notified,
#   wui_urlbase
# Returns:
#   non-zero when the template cannot be read or sed fails, so callers can
#   test the result before mailing anything.
function fill_template {
	local templ="${1:-$mailtempl_longrun}"
	local esc_subject esc_to esc_user esc_days esc_proc esc_machine
	local esc_pid esc_kill esc_renew esc_notified days url_hostinfo

	if [ ! -r "$templ" ]
	then
		echo "$PROG: cannot read template $templ" >&2
		return 1
	fi

	# number of days = everything before the first "-" of the elapsed time;
	# the unquoted expansion lets echo drop surrounding whitespace
	days=$(echo $pid_elapsedtime|sed 's/-.*$//')
	url_hostinfo="${wui_urlbase}/pidinfo.php?host=$pid_machine"

	# Values substituted with "/" as delimiter are escaped, so that a "/",
	# "&" or "\" in them neither breaks the sed command nor gets
	# interpreted by it.
	esc_subject=$(_sed_escape_repl "$2")
	esc_to=$(_sed_escape_repl "$3")
	esc_user=$(_sed_escape_repl "$pid_ownername")
	esc_days=$(_sed_escape_repl "$days")
	esc_proc=$(_sed_escape_repl "$procname")
	esc_machine=$(_sed_escape_repl "$pid_machine")
	esc_pid=$(_sed_escape_repl "$pid_pid")
	esc_kill=$(_sed_escape_repl "$cmd_kill")
	esc_renew=$(_sed_escape_repl "$cmd_renew")
	esc_notified=$(_sed_escape_repl "$time_notified")

	# The url_* values use "," as delimiter and are inserted as given; the
	# code that builds url_info already writes its "&" as "\&".
	# sed reads the template directly, so its exit status is the status of
	# this function.
	sed	-e "/__subject__/s//$esc_subject/g" \
		-e "/__to_email__/s//$esc_to/g" \
		-e "/__user__/s//$esc_user/g" \
		-e "/__days__/s//$esc_days/g" \
		-e "/__procname__/s//$esc_proc/g" \
		-e "/__pid_machine__/s//$esc_machine/g" \
		-e "/__pid_pid__/s//$esc_pid/g" \
		-e "/__url_info__/s,,$url_info," \
		-e "/__url_kill__/s,,$url_kill," \
		-e "/__cmd_kill__/s//$esc_kill/" \
		-e "/__url_renew__/s,,$url_renew," \
		-e "/__cmd_renew__/s//$esc_renew/" \
		-e "/__url_hostinfo__/s,,$url_hostinfo," \
		-e "/__time_notified__/s//$esc_notified/g" \
		"$templ"
}

# Mail the owner of the current long running process, followed by a copy
# to $ADMINMAIL. In test mode nothing is mailed; a single line naming the
# process is printed instead. The template is first filled once with its
# output discarded, only to find out whether filling works at all.
function sendmail_oldproc {
	days=$(echo $pid_elapsedtime|sed 's/-.*$//')
	url_hostinfo="${wui_urlbase}/pidinfo.php?host=$pid_machine"
	subject="[PIM] long running process on $pid_machine"
	to_email="$pid_owner"

	# bail out when the template cannot be filled
	if ! fill_template "$mailtempl_longrun" "$subject" "$to_email" >/dev/null
	then
		echo "$PROG: error filling template for e-mail" >&2
		return
	fi

	# test mode: only report what would have been mailed
	if ! [ $TESTMODE = false ]
	then
		echo long running pid: $pid_machine/$pid_pid for $pid_owner
		return
	fi

	# mail for the owner of the process
	fill_template "$mailtempl_longrun" "$subject" "$to_email" | $SENDMAIL -t

	# copy for the administrators
	subject="[PIM] admincopy long running process on $pid_machine"
	to_email="$ADMINMAIL"
	fill_template "$mailtempl_longrun" "$subject" "$to_email" | $SENDMAIL -t
}


# clean files of offline machines
#
# The machine names are derived from the entries in $pim_datadir that start
# with "mentat" (the name is "mentat" plus the digits that follow it). A
# machine that does not answer a single ping is treated as offline:
#   - its data files ($pim_datadir/<machine>_*) are moved to $pim_olddatadir
#   - its user action files ($pim_useract/<machine>_*) are moved there too,
#     with "_action" appended to the file name
# Globals read: pim_datadir, pim_useract, pim_olddatadir
function archive_offline_machines {
	local machines machine path name
	local -a stale

	# nothing to scan without a data directory
	[ -d "$pim_datadir" ] || return 0
	if [ ! -d "$pim_olddatadir" ]
	then
		echo "$PROG: no directory for old data: $pim_olddatadir" >&2
		return 1
	fi

	# unique list of machine names; every name consists of "mentat" and
	# digits only, so splitting the list on whitespace below is safe
	machines=$(
		for path in "$pim_datadir"/mentat*
		do
			# an unmatched glob is handed over as the literal pattern
			[ -e "$path" ] || continue
			name=${path##*/}
			[[ $name =~ ^mentat[0-9]* ]] && printf '%s\n' "${BASH_REMATCH[0]}"
		done | sort -u
	)

	for machine in $machines
	do
		if ping -c 1 "$machine" &>/dev/null
		then
			continue
		fi

		# move data files to olddata
		stale=("$pim_datadir/$machine"_*)
		if [ -e "${stale[0]}" ]
		then
			mv -- "${stale[@]}" "$pim_olddatadir/"
		fi

		# move user action files for this machine's processes; the target
		# is built from the base name, not from the full source path
		for path in "$pim_useract/$machine"_*
		do
			[ -e "$path" ] || continue
			mv -- "$path" "$pim_olddatadir/${path##*/}_action"
		done
	done
}
archive_offline_machines

#cleanup old user action files
#
# A user action file carries the same name as the pid file it belongs to.
# When that pid file is no longer present in $pim_datadir, the action file
# is moved to $pim_olddatadir with "_action" appended to its name.
# Globals read: pim_useract, pim_datadir, pim_olddatadir
function archive_orphan_actions {
	local f basef

	# without this check an unset directory would make the glob scan "/"
	[ -d "$pim_useract" ] || return 0

	for f in "$pim_useract"/*
	do
		# an empty directory leaves the literal pattern in $f
		[ -e "$f" ] || continue
		basef=${f##*/}
		if [ ! -f "$pim_datadir/$basef" ]
		then
			quiet_mv "$f" "$pim_olddatadir/${basef}_action"
		fi
	done
}
archive_orphan_actions

#cleanup old index files if corresponding pid file is gone
#
# Index files are the entries of $pim_datadir whose name contains at least
# two underscores. The pid file name is the part of the index file name that
# starts at the last "_mentat" (without the underscore); a name without
# "_mentat" refers to itself and is therefore never removed.
# Globals read: pim_datadir
function purge_orphan_indexes {
	local f base p

	# without this check an unset directory would make the glob scan "/"
	[ -d "$pim_datadir" ] || return 0

	for f in "$pim_datadir"/*_*_*
	do
		# no index files at all leaves the literal pattern in $f
		[ -e "$f" ] || continue
		base=${f##*/}
		case $base in
			*_mentat*) p="mentat${base##*_mentat}" ;;
			*) p=$base ;;
		esac
		if [ ! -f "$pim_datadir/$p" ]
		then
			rm -- "$f"
		fi
	done
}
purge_orphan_indexes

# Mail the administrators the details of a renew request that was refused.
# Reads the pid_* and user_* variables of the pid file being processed.
function mail_renew_mismatch {
	$SENDMAIL -t <<EOF
Subject: [PIM] idnotif: renew failed for $pid_owner; key mismatch 
To: $ADMINMAIL
From: $USER

         pid=$pid_pid
     machine=$pid_machine
   pid_owner=$pid_owner
pid_notified="$pid_notified" $(epoch2date $pid_notified)
 pid_renewed="$pid_renewed" $(epoch2date $pid_renewed)
  user_renew="$user_renew" $(epoch2date $user_renew)
 pid_randval="$pid_randval"
    user_key="$user_key"
   user_kill="$user_kill" $(epoch2date $user_kill)
EOF
}

# check all pidfiles
#
# For every pid file: load its variables and the matching user action file,
# skip excluded or unmonitored owners and processes that have not yet run
# for $time_initial days, process a renew request from the user, and notify
# the owner by mail when a (new) notification is due. What happened is
# appended to $mailreport.
for pidfile in "$pim_datadir"/mentat*
do
	if [ "$TESTMODE" = true ]
	then
		echo processing $pidfile >&2
	fi
	if [ ! -f "$pidfile" ]
	then
		# probably no files exist for this machine
		echo "DEBUG: no files exist for this machine: $pidfile" >>"$mailreport"
		continue
	fi

	#make sure we reset variables which we test for and which are not
	#always overwritten
	pid_ownername=""
	pid_notified="0"
	pid_randval=""
	pid_renewed="0"
	user_renew="0"
	user_kill="0"
	# also forget the key of a previously processed action file, so it can
	# never validate a renew request for another process
	user_key=""

	# source the pid_file
	. "$pidfile"

	# source user interaction file, if present
	actionfile="$pim_useract/$(basename "$pidfile")"
	if [ -f "$actionfile" ]
	then
		. "$actionfile"
	fi

	# test whether user must be excluded from the notify regime
	if exclude_user $pid_owner
	then
		if [ "$TESTMODE" = false ]
		then
			echo "pid_excluded=$EPOCHNOW" >>"$pidfile"
		else
			echo "pid excluded: $pid_pid on $pid_machine for $pid_owner"
		fi
		continue
	fi

	if [ -f "$pid_configused" ]
	then
		# source the config file used
		. "$pid_configused"
	else
		echo "$pid_pid on $pid_machine for $pid_owner error: $pid_configused" >>"$mailreport"
		continue
	fi

	#skip if a group filter exists and the owner is in none of the groups
	if [ -n "$only_for_groups" ]
	then
		if ! id -Gn $pid_owner|grep -Ew "$only_for_groups" >/dev/null
		then
			#echo "user $pid_owner group-ignored ($pid_machine $pid_pid $pid_elapsedtime)" >>$mailreport
			continue
		fi
	fi

	#skip if a user filter exists and the owner is not in the list
	if [ -n "$only_for_users" ]
	then
		if ! echo $pid_owner|grep -Ew "$only_for_users" >/dev/null
		then
			#echo "user $pid_owner user-ignored ($pid_machine $pid_pid $pid_elapsedtime)" >>$mailreport
			continue
		fi
	fi

	#start state machine stuff
	state="unknown"
	#skip this if pid not running for time_initial
	if [ "$(etime2sec "$pid_elapsedtime")" -lt $(($time_initial*$SECPERDAY)) ]
	then
		state="initial"
		#echo "$pid_pid on $pid_machine for $pid_owner: pid not running for $time_initial days: $pid_elapsedtime" >>$mailreport
		continue
	fi

	# a renew request from the user that has not been processed yet
	if [ "$user_renew" -gt 0 ] && [ "$pid_renewed" -lt "$user_renew" ]
	then
		# The key must be present and equal to the stored one. Both sides
		# are quoted: unquoted, two empty values turn the test into
		# "[ = ]", which is true and would accept a renew without any key.
		if [ -n "$pid_randval" ] && [ "$pid_randval" = "$user_key" ]
		then
			# renew verified, set pid_renewed and renew process
			pid_renewed=$user_renew
			if [ "$TESTMODE" = false ]
			then
				renew_pid  "$pidfile"  $user_renew
			fi
			echo "renewed $pid_pid for $pid_owner on $pid_machine" >>"$mailreport"
		else
			echo "renew failed $pid_pid for $pid_owner on $pid_machine" >>"$mailreport"
			mail_renew_mismatch
		fi
	fi

	if [ "$pid_notified" -gt 0 ] && [ "$pid_renewed" -gt "$pid_notified" ]
	then
		state="extended"
		# still inside the extension period: nothing to do
		if [ "$pid_renewed" -gt $(( $EPOCHNOW - ($time_extended*$SECPERDAY) )) ]
		then
			echo "$pid_pid on $pid_machine for $pid_owner: pid renewed as of $(epoch2date $pid_renewed) for $time_extended days. $pid_elapsedtime" >>"$mailreport"
			continue
		fi
	elif [ "$pid_notified" -gt 0 ]
	then
		state="notified"
		#echo "$pid_pid on $pid_machine for $pid_owner: already notified $(epoch2date $pid_notified). $pid_elapsedtime" >>$mailreport
		continue
	fi

	# skip notify if kill has been issued as last command by user:
	if [ "$state" = "extended" ] && [ "$user_kill" -gt "$pid_renewed" ]
	then
		echo "$pid_pid on $pid_machine for $pid_owner skip notify, kill is last command kill:$user_kill renew:$user_renew" >>"$mailreport"
		continue
	fi

	# there is no valid kill command for this process
	# the process was either never notified or its extension has run out,
	# so the user needs a notification
	pid_renewbefore=$(($EPOCHNOW + $time_notified))

	# starting to compose the notification e-mail to the user:
	#random value to obfuscate the base64 even more:
	randval=$(mcookie |sed 's/^.*\(............\).*$/\1/')
	#reduce the mentatnode to the number (size issues with base64)
	mentatnr=$(echo $pid_machine |sed 's/mentat//')

	#compose URLs the user can click on:
	urlkey=$(echo $mentatnr $pid_pid $pid_owner rnew $randval |openssl enc -base64)
	url_renew="${wui_urlbase}/pidaction.php?key=$urlkey"
	cmd_renew="pim \"$urlkey\""
	urlkey=$(echo $mentatnr $pid_pid $pid_owner kill $randval |openssl enc -base64)
	url_kill="${wui_urlbase}/pidaction.php?key=$urlkey"
	cmd_kill="pim \"$urlkey\""
	# the "&" is written as "\&" because this value ends up in the
	# replacement part of a sed command in fill_template
	url_info="${wui_urlbase}/pidinfo.php?host=$pid_machine\&pid=$pid_pid"
	#echo $urlkey
	echo "$url_info" >>"$mailreport"

	#set meaningful string to inform user when we last saw the pid:
	lastcheck=$(awk -vtimestamp="$pid_timestamp" 'BEGIN {print strftime("%F %R", timestamp)}')

	# get full username, if we don't have it yet
	if [ "$pid_ownername" = "na" ] || [ -z "$pid_ownername" ]
	then
		# The name is appended to the pid file, which is sourced as shell
		# code on the next run; characters the shell would interpret
		# inside double quotes are therefore removed from it first.
		pid_ownername=$(finger $pid_owner |sed -n '/Login.*Name: \(.*\)$/s//\1/p' |tr -d '"$`\\')
		echo "pid_ownername=\"$pid_ownername\"" >>"$pidfile"
	fi

	# add to report and send mail to owner (if not in debug mode)
	echo "$pid_pid on $pid_machine for $pid_owner etime:$pid_elapsedtime (>$time_initial) send mail to $pid_ownername " >>"$mailreport"
	if [ "$DEBUGNOTIFY" != "true" ]
	then
		if [ "$TESTMODE" = "false" ]
		then
			# here we interact with the world: inform user, set notification into file
			echo "pid_notified=\"$EPOCHNOW\"" >>"$pidfile"
			echo "pid_notified_human=\"$(epoch2date $EPOCHNOW)\"" >>"$pidfile"
			echo "pid_randval=\"$randval\"" >>"$pidfile"
		fi
		#echo sending mail to $pid_owner
		sendmail_oldproc
		echo "$pid_pid: notify mail sent to $pid_ownername " >>"$mailreport"
	fi

	# for now, don't bother looking into memory problems
	continue

done

# Mail the collected report. The Subject: header was written without any text
# when the report file was initialised; its text is appended here and depends
# on whether this is a debug run.
if [ $DEBUGNOTIFY = "true" ]
then
	report_title='& [PIM] Debug notify report'
else
	report_title='& [PIM] notify report '"$(date)"
fi
sed "/^Subject: /s//$report_title/" $mailreport | $SENDMAIL -t

# In test mode the report is also printed before the file is removed.
[ $TESTMODE = "true" ] && cat $mailreport
rm -f $mailreport
	
